{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "zLxBMw9Lsr6I",
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "!pip install datasets tqdm lemminflect"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "qyuuLNEzsaYR"
   },
   "outputs": [],
   "source": [
    "import gzip\n",
    "import json\n",
    "import pandas as pd\n",
    "\n",
    "from collections import defaultdict\n",
    "from datasets import load_dataset\n",
    "from tqdm.auto import tqdm\n",
    "from random import random, randint\n",
    "from lemminflect import getAllInflections, getLemma\n",
    "\n",
    "ONE_STEP_OUPUT_CODE_TEMPLATES = [\n",
    "    # VBZ\n",
    "    \"Can you write a program in {lang} where it\\n\",\n",
    "    \"How would you implement a function in {lang} that\\n\",\n",
    "    \"Can you create a {lang} program that\\n\",\n",
    "    \"Can you implement a function in {lang} that\\n\",\n",
    "    # VBP\n",
    "    \"Implement a function in {lang} to\\n\",\n",
    "    \"How would you code a program in {lang} to\\n\",\n",
    "    \"Write a {lang} script to\\n\",\n",
    "    \"Create a {lang} function to\\n\",\n",
    "    \"Write a {lang} program that can\\n\",\n",
    "    # VBG\n",
    "    \"Write a {lang} script for\\n\",\n",
    "    \"Write a {lang} function for\\n\",\n",
    "    \"Create a {lang} function for\\n\",\n",
    "    \"Implement a {lang} function for\\n\",\n",
    "]\n",
    "\n",
    "ONE_STEP_OUPUT_SUMMARY_TEMPLATES = [\n",
    "    # General answer\n",
    "    \"Explain what the following {lang} code does\\n\",\n",
    "    \"Can you tell what is the following {lang} function doing\\n\",\n",
    "    \"Here you have a function in {lang}, explain what it does\\n\",\n",
    "    \"Make a summary of the following {lang} code\\n\",\n",
    "    \"Can you generate a brief explanation for the following {lang} code\\n\",\n",
    "    \"How would you explain what the following {lang} function does\\n\",\n",
    "    # Documentation\n",
    "    \"Can you generate the documentation for the following {lang} function\\n\",\n",
    "    \"Create a docstring for the following {lang} code\\n\",\n",
    "    \"Given the following {lang} function, write the documentation\\n\",\n",
    "    \"Write a docstring for the following {lang} function\\n\",\n",
    "]\n",
    "\n",
    "\n",
    "def remove_docstring(code_function):\n",
    "    triple_quotes = '\"\"\"'\n",
    "    lines = code_function.split(\"\\n\")\n",
    "\n",
    "    c = lines[1].count(triple_quotes)\n",
    "    # There is no docstring\n",
    "    if c == 0:\n",
    "        return code_function\n",
    "    # One line dostring\n",
    "    if c == 2:\n",
    "        return \"\\n\".join([lines[0]] + lines[2:])\n",
    "\n",
    "    idx = 2\n",
    "    while idx < len(lines) and triple_quotes not in lines[idx]:\n",
    "        idx += 1\n",
    "\n",
    "    return \"\\n\".join([lines[0]] + lines[idx + 1 :])\n",
    "\n",
    "\n",
    "def process_summary(summary, tag):\n",
    "    words = summary.split()\n",
    "    lemma = getLemma(words[0].lower(), upos=\"VERB\")[0]\n",
    "    inflections = getAllInflections(lemma)\n",
    "\n",
    "    if tag not in inflections:\n",
    "        words[0] = words[0].lower()\n",
    "    else:\n",
    "        words[0] = inflections[tag][0]\n",
    "\n",
    "    return \" \".join(words)\n",
    "\n",
    "\n",
    "lang = \"Python 3\"\n",
    "data = defaultdict(list)\n",
    "dataset = load_dataset(\"Nan-Do/code-search-net-python\")\n",
    "\n",
    "for data_point in tqdm(dataset[\"train\"]):\n",
    "    code = data_point[\"original_string\"]\n",
    "    summary = data_point[\"summary\"]\n",
    "    data[\"SOURCE\"].append(\"codesearchnet\")\n",
    "    # Generate code\n",
    "    if random() > 0.5:\n",
    "        idx = randint(0, len(ONE_STEP_OUPUT_CODE_TEMPLATES) - 1)\n",
    "        if 0 <= idx <= 3:\n",
    "            tag = \"VBZ\"\n",
    "        elif 4 <= idx <= 8:\n",
    "            tag = \"VBP\"\n",
    "        else:\n",
    "            tag = \"VBG\"\n",
    "        summary = process_summary(summary, tag)\n",
    "        template = ONE_STEP_OUPUT_CODE_TEMPLATES[idx].format(lang=lang) + summary\n",
    "        data[\"INSTRUCTION\"].append(template)\n",
    "        data[\"RESPONSE\"].append(code)\n",
    "    # Generate summary\n",
    "    else:\n",
    "        # We are generating the docstring or a summary so we better remove it from\n",
    "        # the function\n",
    "        # if random() < 0.9:\n",
    "        #    code = remove_docstring(code)\n",
    "        code = remove_docstring(code)\n",
    "        idx = randint(0, len(ONE_STEP_OUPUT_SUMMARY_TEMPLATES) - 1)\n",
    "        template = ONE_STEP_OUPUT_SUMMARY_TEMPLATES[idx].format(lang=lang) + code\n",
    "        data[\"INSTRUCTION\"].append(template)\n",
    "        if idx <= 5:\n",
    "            data[\"RESPONSE\"].append(summary)\n",
    "        else:\n",
    "            data[\"RESPONSE\"].append('\"\"\"' + summary + '\"\"\"')\n",
    "\n",
    "df = pd.DataFrame(data=data)\n",
    "df.to_parquet(\"instructional_dataset.parquet\", row_group_size=100, engine=\"pyarrow\", index=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "_6jaUZRsy1-R"
   },
   "outputs": [],
   "source": [
    "from huggingface_hub import notebook_login\n",
    "\n",
    "notebook_login()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "DSHrvbF6tIyd"
   },
   "outputs": [],
   "source": [
    "from datasets import Dataset\n",
    "\n",
    "ds = Dataset.from_parquet(\"instructional_dataset.parquet\")\n",
    "ds.push_to_hub(\"Nan-Do/instructional_code-search-net-python\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "colab": {
   "provenance": []
  },
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.9"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 1
}
